Load the tidy data from disk if it is available. If it is not
available, run notebook-0-tidy.Rmd to create it.
df <- readRDS('../data/processed/tidy.Rds')
str(df)
## 'data.frame': 1537 obs. of 29 variables:
## $ owner_id : chr "0031b2aaeba16a973a7126a3919fedd6" "0031b2aaeba16a973a7126a3919fedd6" "0031b2aaeba16a973a7126a3919fedd6" "0031b2aaeba16a973a7126a3919fedd6" ...
## $ dog_name : chr "basso" "buddy" "cassandra" "chief" ...
## $ time_thinking : Factor w/ 4 levels "lt_one_wk","one_wk_to_six_mo",..: 2 2 2 2 2 3 3 2 2 2 ...
## $ primary_motivation : Factor w/ 8 levels "for_other_pet",..: 2 2 8 6 1 2 2 1 1 2 ...
## $ acquisition_source : Factor w/ 6 levels "foreign","breeder",..: 2 2 1 2 6 2 2 6 3 6 ...
## $ met_expectations : Ord.factor w/ 3 levels "no"<"partial"<..: 3 3 3 3 3 3 3 3 3 3 ...
## $ is_living_with_dog : logi FALSE FALSE TRUE TRUE TRUE TRUE ...
## $ time_together_len : Factor w/ 4 levels "lt_one_wk","one_wk_to_six_mo",..: 4 4 NA NA NA NA NA NA NA NA ...
## $ curr_dog_location : Factor w/ 6 levels "euthanized","died",..: 2 2 NA NA NA NA NA NA NA NA ...
## $ is_consider_another_dog : logi TRUE NA NA NA NA NA ...
## $ revised_acquisition_source: Factor w/ 6 levels "foreign","breeder",..: 2 NA NA NA NA NA 2 6 6 6 ...
## $ owner_gender : Factor w/ 2 levels "female","male": 1 1 1 1 1 1 1 2 2 1 ...
## $ owner_age : num 57 57 57 57 57 43 43 70 70 75 ...
## $ age_rank : num 2 2 3 1 4 3 6 5 5 7 ...
## $ appearance_rank : num 6 4 1 3 3 4 5 7 4 6 ...
## $ breed_rank : num 1 1 2 2 7 1 1 6 2 1 ...
## $ compatability_rank : num 3 5 4 4 1 5 3 2 6 4 ...
## $ personality_rank : num 5 6 5 5 6 6 4 1 1 2 ...
## $ size_rank : num 7 7 6 6 5 7 7 3 7 5 ...
## $ trainability_rank : num 7 7 7 7 7 2 2 4 3 3 ...
## $ revised_age_rank : num 2 NA NA NA NA NA 3 6 7 5 ...
## $ revised_appearance_rank : num 4 NA NA NA NA NA 7 5 1 6 ...
## $ revised_breed_rank : num 1 NA NA NA NA NA 4 4 2 1 ...
## $ revised_compatability_rank: num 5 NA NA NA NA NA 1 3 5 2 ...
## $ revised_personality_rank : num 3 NA NA NA NA NA 6 1 6 4 ...
## $ revised_size_rank : num 6 NA NA NA NA NA 5 7 3 7 ...
## $ revised_trainability_rank : num 7 NA NA NA NA NA 2 2 4 3 ...
## $ is_satisfied : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ is_owner_male : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
stopifnot(identical(dim(df)+0, c(1537, 29)))
A summary of the loaded data is provided below.
summary(df)
## owner_id dog_name time_thinking
## Length:1537 Length:1537 lt_one_wk :196
## Class :character Class :character one_wk_to_six_mo :752
## Mode :character Mode :character gt_six_mo_to_six_yr:528
## gt_six_yr : 61
##
##
##
## primary_motivation acquisition_source met_expectations
## companionship:846 foreign : 12 no : 7
## for_other_pet:224 breeder :480 partial: 248
## other :187 family_or_friend:130 yes :1282
## working :100 found : 51
## exercise : 77 pet_shop : 68
## family : 70 rescue :796
## (Other) : 33
## is_living_with_dog time_together_len curr_dog_location
## Mode :logical lt_one_wk : 1 euthanized : 34
## FALSE:190 one_wk_to_six_mo : 4 died : 142
## TRUE :1347 gt_six_mo_to_six_yr: 36 lost : 1
## gt_six_yr : 149 rehomed : 5
## NA's :1347 surrendered: 1
## other : 7
## NA's :1347
## is_consider_another_dog revised_acquisition_source owner_gender
## Mode :logical foreign : 4 female:1433
## FALSE:61 breeder :226 male : 104
## TRUE :876 family_or_friend: 10
## NA's :600 pet_shop : 8
## found : 6
## rescue :622
## NA's :661
## owner_age age_rank appearance_rank breed_rank
## Min. :18.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:42.00 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:1.000
## Median :54.00 Median :4.000 Median :5.000 Median :4.000
## Mean :52.31 Mean :4.314 Mean :4.541 Mean :3.632
## 3rd Qu.:64.00 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :85.00 Max. :7.000 Max. :7.000 Max. :7.000
##
## compatability_rank personality_rank size_rank trainability_rank
## Min. :1.000 Min. :1.0 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.0 1st Qu.:3.000 1st Qu.:3.000
## Median :5.000 Median :2.0 Median :4.000 Median :5.000
## Mean :4.492 Mean :2.5 Mean :4.281 Mean :4.651
## 3rd Qu.:7.000 3rd Qu.:3.0 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :7.000 Max. :7.0 Max. :7.000 Max. :7.000
##
## revised_age_rank revised_appearance_rank revised_breed_rank
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:4.000 1st Qu.:3.000
## Median :5.000 Median :6.000 Median :5.000
## Mean :4.595 Mean :5.467 Mean :4.377
## 3rd Qu.:6.000 3rd Qu.:7.000 3rd Qu.:6.000
## Max. :7.000 Max. :7.000 Max. :7.000
## NA's :661 NA's :661 NA's :661
## revised_compatability_rank revised_personality_rank revised_size_rank
## Min. :1.000 Min. :1.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:3.000
## Median :3.000 Median :2.00 Median :4.000
## Mean :3.334 Mean :2.16 Mean :4.301
## 3rd Qu.:5.000 3rd Qu.:3.00 3rd Qu.:6.000
## Max. :7.000 Max. :7.00 Max. :7.000
## NA's :661 NA's :661 NA's :661
## revised_trainability_rank is_satisfied is_owner_male
## Min. :1.000 Mode :logical Mode :logical
## 1st Qu.:3.000 FALSE:255 FALSE:1433
## Median :4.000 TRUE :1282 TRUE :104
## Mean :4.009
## 3rd Qu.:5.000
## Max. :7.000
## NA's :661
The number of participating owners is equal to the number of unique owner identifiers.
length(unique(df$owner_id))
## [1] 933
Generate a breakdown of owner gender.
df %>%
dplyr::distinct(owner_id, .keep_all=TRUE) %>%
dplyr::count(owner_gender) %>%
dplyr::mutate(freq = round(n / sum(n) * 100, 2)) %>%
janitor::adorn_totals("row")
## owner_gender n freq
## female 859 92.07
## male 74 7.93
## Total 933 100.00
Generate summary statistics for owner ages.
summary(
df %>%
dplyr::distinct(owner_id, .keep_all=TRUE) %>%
dplyr::select(owner_age)
)
## owner_age
## Min. :18.00
## 1st Qu.:42.00
## Median :53.00
## Mean :51.53
## 3rd Qu.:63.00
## Max. :85.00
Examine the distribution of owner ages.
df %>%
ggplot(aes(x=owner_age)) +
geom_histogram(alpha=0.5, position="identity", aes(y = ..density..),
color="black", bins=30) +
geom_density() +
geom_vline(xintercept=mean(df$owner_age), color="black", linetype="dashed",
size=1)
Examine the distribution of ages split by owner gender.
df %>%
dplyr::select(is_owner_male, owner_age) %>%
dplyr::transmute(male_age=ifelse(is_owner_male==TRUE, owner_age, NA),
female_age=ifelse(is_owner_male==FALSE, owner_age, NA)) %>%
summary()
## male_age female_age
## Min. :20.00 Min. :18.00
## 1st Qu.:52.75 1st Qu.:42.00
## Median :62.00 Median :54.00
## Mean :57.88 Mean :51.91
## 3rd Qu.:67.25 3rd Qu.:63.00
## Max. :75.00 Max. :85.00
## NA's :1433 NA's :104
means <- df %>%
dplyr::group_by(owner_gender) %>%
dplyr::summarise(means = mean(owner_age))
df %>%
ggplot(aes(owner_age, fill = owner_gender)) +
geom_histogram(alpha=0.7, position="identity", aes(y = ..density..),
color="black", bins=30) +
geom_density(alpha=0.7) +
geom_vline(xintercept=means$means, color=c("#F8766D", "#00BFC4"),
linetype="dashed", size=0.75)
The number of dogs corresponds to the number of rows in the data frame.
dim(df)[1]
## [1] 1537
Generate summary statistics for the number of dogs per household.
summary(as.data.frame(table(df$owner_id))$Freq)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.647 2.000 10.000
Determine the number of single dog households.
df %>%
dplyr::count(owner_id) %>%
dplyr::mutate(freq = n / sum(n)) %>%
dplyr::filter(n == 1) %>%
dplyr::mutate(freq = round(sum(freq) * 100, 2)) %>%
dplyr::count(freq)
## freq n
## 1 32.21 495
Generate breakdown of living situation.
df %>%
dplyr::count(is_living_with_dog) %>%
dplyr::mutate(freq = round(n / sum(n) * 100, 2)) %>%
janitor::adorn_totals("row")
## is_living_with_dog n freq
## FALSE 190 12.36
## TRUE 1347 87.64
## Total 1537 100.00
df %>%
dplyr::filter(is_living_with_dog == FALSE) %>%
dplyr::count(time_together_len) %>%
dplyr::mutate(freq = round(n / sum(n) * 100, 2)) %>%
janitor::adorn_totals("row")
## time_together_len n freq
## lt_one_wk 1 0.53
## one_wk_to_six_mo 4 2.11
## gt_six_mo_to_six_yr 36 18.95
## gt_six_yr 149 78.42
## Total 190 100.01
df %>%
dplyr::filter(is_living_with_dog == FALSE) %>%
dplyr::count(curr_dog_location) %>%
dplyr::mutate(freq = round(n / sum(n) * 100, 2)) %>%
janitor::adorn_totals("row")
## curr_dog_location n freq
## euthanized 34 17.89
## died 142 74.74
## lost 1 0.53
## rehomed 5 2.63
## surrendered 1 0.53
## other 7 3.68
## Total 190 100.00
Generate breakdown for meeting of expectations.
df %>%
dplyr::count(met_expectations) %>%
dplyr::mutate(freq = round(n / sum(n) * 100, 2)) %>%
janitor::adorn_totals("row")
## met_expectations n freq
## no 7 0.46
## partial 248 16.14
## yes 1282 83.41
## Total 1537 100.01
df %>%
dplyr::group_by(is_satisfied) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(freq = round(n / sum(n) * 100, 2)) %>%
janitor::adorn_totals("row")
## is_satisfied n freq
## FALSE 255 16.59
## TRUE 1282 83.41
## Total 1537 100.00
Get count of owners with at least one dog that met expectations.
df %>%
dplyr::group_by(owner_id) %>%
dplyr::summarise(sat_cnt = sum(is_satisfied)) %>%
dplyr::filter(sat_cnt > 0)
## # A tibble: 843 × 2
## owner_id sat_cnt
## <chr> <int>
## 1 0031b2aaeba16a973a7126a3919fedd6 5
## 2 00f2854a7b7bcd35db54310d3d9ea142 2
## 3 0143addbe877065bb8d940e6e8901700 2
## 4 018b0b08b0a8dbc63f58c47b0c94d2e4 2
## 5 01a4ef3f5851f6796465f7678bcc8612 2
## 6 01b1aee4b9aade64e464d6fc40dc679b 1
## 7 01d2a80069fbbc5d0151d4b108cbcfed 1
## 8 01f155269919ba42302c9ce859842872 2
## 9 02093f7a704d1d81a701e841681c0cac 1
## 10 02096176f012a8064b0187ce4a946f7b 1
## # … with 833 more rows
Get a count of owners where all dogs met expectations.
df %>%
dplyr::group_by(owner_id) %>%
dplyr::summarise(sat_cnt = sum(is_satisfied), tot_cnt=n()) %>%
dplyr::filter(sat_cnt == tot_cnt)
## # A tibble: 707 × 3
## owner_id sat_cnt tot_cnt
## <chr> <int> <int>
## 1 0031b2aaeba16a973a7126a3919fedd6 5 5
## 2 00f2854a7b7bcd35db54310d3d9ea142 2 2
## 3 0143addbe877065bb8d940e6e8901700 2 2
## 4 018b0b08b0a8dbc63f58c47b0c94d2e4 2 2
## 5 01a4ef3f5851f6796465f7678bcc8612 2 2
## 6 01b1aee4b9aade64e464d6fc40dc679b 1 1
## 7 01d2a80069fbbc5d0151d4b108cbcfed 1 1
## 8 01f155269919ba42302c9ce859842872 2 2
## 9 02093f7a704d1d81a701e841681c0cac 1 1
## 10 025fdb64fcd4cf4a3a8b5a750b7abc07 2 2
## # … with 697 more rows
Generate breakdown of acquisition sources.
df %>%
dplyr::count(acquisition_source) %>%
dplyr::mutate(freq = round(n / sum(n) * 100, 2)) %>%
janitor::adorn_totals("row")
## acquisition_source n freq
## foreign 12 0.78
## breeder 480 31.23
## family_or_friend 130 8.46
## found 51 3.32
## pet_shop 68 4.42
## rescue 796 51.79
## Total 1537 100.00
Generate breakdown for time spent thinking.
df %>%
dplyr::count(time_thinking) %>%
dplyr::mutate(freq = round(n / sum(n) * 100, 2)) %>%
janitor::adorn_totals("row")
## time_thinking n freq
## lt_one_wk 196 12.75
## one_wk_to_six_mo 752 48.93
## gt_six_mo_to_six_yr 528 34.35
## gt_six_yr 61 3.97
## Total 1537 100.00
Generate breakdown for primary motivation.
df %>%
dplyr::count(primary_motivation) %>%
dplyr::mutate(freq = round(n / sum(n) * 100, 2)) %>%
janitor::adorn_totals("row")
## primary_motivation n freq
## for_other_pet 224 14.57
## companionship 846 55.04
## exercise 77 5.01
## protection 12 0.78
## social 21 1.37
## family 70 4.55
## working 100 6.51
## other 187 12.17
## Total 1537 100.00
Generate a summary for characteristic ranks.
df %>%
dplyr::select(contains("rank") & !contains("revised")) %>%
summary()
## age_rank appearance_rank breed_rank compatability_rank
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:2.000
## Median :4.000 Median :5.000 Median :4.000 Median :5.000
## Mean :4.314 Mean :4.541 Mean :3.632 Mean :4.492
## 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:7.000
## Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000
## personality_rank size_rank trainability_rank
## Min. :1.0 Min. :1.000 Min. :1.000
## 1st Qu.:1.0 1st Qu.:3.000 1st Qu.:3.000
## Median :2.0 Median :4.000 Median :5.000
## Mean :2.5 Mean :4.281 Mean :4.651
## 3rd Qu.:3.0 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :7.0 Max. :7.000 Max. :7.000
Breakdown of owners that would consider getting another dog.
df %>%
dplyr::group_by(owner_id) %>%
tidyr::fill(is_consider_another_dog, .direction="downup") %>%
dplyr::ungroup() %>%
dplyr::distinct(owner_id, .keep_all=TRUE) %>%
dplyr::count(is_consider_another_dog) %>%
dplyr::mutate(freq = round(n / sum(n) * 100, 2)) %>%
janitor::adorn_totals("row")
## is_consider_another_dog n freq
## FALSE 61 6.54
## TRUE 851 91.21
## <NA> 21 2.25
## Total 933 100.00
Breakdown of fully satisfied owners considering another dog.
df %>%
dplyr::group_by(owner_id) %>%
dplyr::mutate(sat_cnt = sum(is_satisfied), tot_cnt=n()) %>%
dplyr::filter(sat_cnt == tot_cnt) %>%
tidyr::fill(is_consider_another_dog, .direction="downup") %>%
dplyr::ungroup() %>%
dplyr::distinct(owner_id, .keep_all=TRUE) %>%
dplyr::count(is_consider_another_dog) %>%
dplyr::mutate(freq = round(n / sum(n) * 100, 2)) %>%
janitor::adorn_totals("row")
## is_consider_another_dog n freq
## FALSE 44 6.22
## TRUE 644 91.09
## <NA> 19 2.69
## Total 707 100.00
Breakdown of owners satisfied with some, but not all, dogs considering another dog.
df %>%
dplyr::group_by(owner_id) %>%
dplyr::mutate(sat_cnt = sum(is_satisfied), tot_cnt=n()) %>%
dplyr::filter(sat_cnt < tot_cnt, sat_cnt > 0) %>%
tidyr::fill(is_consider_another_dog, .direction="downup") %>%
dplyr::ungroup() %>%
dplyr::distinct(owner_id, .keep_all=TRUE) %>%
dplyr::count(is_consider_another_dog) %>%
dplyr::mutate(freq = round(n / sum(n) * 100, 2)) %>%
janitor::adorn_totals("row")
## is_consider_another_dog n freq
## FALSE 4 2.94
## TRUE 131 96.32
## <NA> 1 0.74
## Total 136 100.00
Breakdown of owners not satisfied with any dogs considering another dog.
df %>%
dplyr::group_by(owner_id) %>%
dplyr::mutate(sat_cnt = sum(is_satisfied), tot_cnt=n()) %>%
dplyr::filter(sat_cnt == 0) %>%
tidyr::fill(is_consider_another_dog, .direction="downup") %>%
dplyr::ungroup() %>%
dplyr::distinct(owner_id, .keep_all=TRUE) %>%
dplyr::count(is_consider_another_dog) %>%
dplyr::mutate(freq = round(n / sum(n) * 100, 2)) %>%
janitor::adorn_totals("row")
## is_consider_another_dog n freq
## FALSE 13 14.44
## TRUE 76 84.44
## <NA> 1 1.11
## Total 90 99.99
Breakdown of revised acquisition source.
df %>%
dplyr::group_by(revised_acquisition_source) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(freq = round(n / sum(n) * 100, 2)) %>%
janitor::adorn_totals("row")
## revised_acquisition_source n freq
## foreign 4 0.26
## breeder 226 14.70
## family_or_friend 10 0.65
## pet_shop 8 0.52
## found 6 0.39
## rescue 622 40.47
## <NA> 661 43.01
## Total 1537 100.00
Generate a summary for revised characteristic ranks.
df %>%
dplyr::select(contains("rank") & contains("revised")) %>%
summary()
## revised_age_rank revised_appearance_rank revised_breed_rank
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:4.000 1st Qu.:3.000
## Median :5.000 Median :6.000 Median :5.000
## Mean :4.595 Mean :5.467 Mean :4.377
## 3rd Qu.:6.000 3rd Qu.:7.000 3rd Qu.:6.000
## Max. :7.000 Max. :7.000 Max. :7.000
## NA's :661 NA's :661 NA's :661
## revised_compatability_rank revised_personality_rank revised_size_rank
## Min. :1.000 Min. :1.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:3.000
## Median :3.000 Median :2.00 Median :4.000
## Mean :3.334 Mean :2.16 Mean :4.301
## 3rd Qu.:5.000 3rd Qu.:3.00 3rd Qu.:6.000
## Max. :7.000 Max. :7.00 Max. :7.000
## NA's :661 NA's :661 NA's :661
## revised_trainability_rank
## Min. :1.000
## 1st Qu.:3.000
## Median :4.000
## Mean :4.009
## 3rd Qu.:5.000
## Max. :7.000
## NA's :661
Difference in rank between revised rank and average of prior ranks.
df %>%
dplyr::select(contains("rank")) %>%
dplyr::summarise(
dplyr::across(
everything(),~ janitor::round_half_up(mean(., na.rm = TRUE), 2))) %>%
dplyr::mutate(
age_delta = revised_age_rank - age_rank,
app_delta = revised_appearance_rank - appearance_rank,
breed_delta = revised_breed_rank - breed_rank,
comp_delta = revised_compatability_rank - compatability_rank,
pers_delta = revised_personality_rank - personality_rank,
size_delta = revised_size_rank - size_rank,
train_delta = revised_trainability_rank - trainability_rank) %>%
summary()
## age_rank appearance_rank breed_rank compatability_rank
## Min. :4.31 Min. :4.54 Min. :3.63 Min. :4.49
## 1st Qu.:4.31 1st Qu.:4.54 1st Qu.:3.63 1st Qu.:4.49
## Median :4.31 Median :4.54 Median :3.63 Median :4.49
## Mean :4.31 Mean :4.54 Mean :3.63 Mean :4.49
## 3rd Qu.:4.31 3rd Qu.:4.54 3rd Qu.:3.63 3rd Qu.:4.49
## Max. :4.31 Max. :4.54 Max. :3.63 Max. :4.49
## personality_rank size_rank trainability_rank revised_age_rank
## Min. :2.5 Min. :4.28 Min. :4.65 Min. :4.59
## 1st Qu.:2.5 1st Qu.:4.28 1st Qu.:4.65 1st Qu.:4.59
## Median :2.5 Median :4.28 Median :4.65 Median :4.59
## Mean :2.5 Mean :4.28 Mean :4.65 Mean :4.59
## 3rd Qu.:2.5 3rd Qu.:4.28 3rd Qu.:4.65 3rd Qu.:4.59
## Max. :2.5 Max. :4.28 Max. :4.65 Max. :4.59
## revised_appearance_rank revised_breed_rank revised_compatability_rank
## Min. :5.47 Min. :4.38 Min. :3.33
## 1st Qu.:5.47 1st Qu.:4.38 1st Qu.:3.33
## Median :5.47 Median :4.38 Median :3.33
## Mean :5.47 Mean :4.38 Mean :3.33
## 3rd Qu.:5.47 3rd Qu.:4.38 3rd Qu.:3.33
## Max. :5.47 Max. :4.38 Max. :3.33
## revised_personality_rank revised_size_rank revised_trainability_rank
## Min. :2.16 Min. :4.3 Min. :4.01
## 1st Qu.:2.16 1st Qu.:4.3 1st Qu.:4.01
## Median :2.16 Median :4.3 Median :4.01
## Mean :2.16 Mean :4.3 Mean :4.01
## 3rd Qu.:2.16 3rd Qu.:4.3 3rd Qu.:4.01
## Max. :2.16 Max. :4.3 Max. :4.01
## age_delta app_delta breed_delta comp_delta pers_delta
## Min. :0.28 Min. :0.93 Min. :0.75 Min. :-1.16 Min. :-0.34
## 1st Qu.:0.28 1st Qu.:0.93 1st Qu.:0.75 1st Qu.:-1.16 1st Qu.:-0.34
## Median :0.28 Median :0.93 Median :0.75 Median :-1.16 Median :-0.34
## Mean :0.28 Mean :0.93 Mean :0.75 Mean :-1.16 Mean :-0.34
## 3rd Qu.:0.28 3rd Qu.:0.93 3rd Qu.:0.75 3rd Qu.:-1.16 3rd Qu.:-0.34
## Max. :0.28 Max. :0.93 Max. :0.75 Max. :-1.16 Max. :-0.34
## size_delta train_delta
## Min. :0.02 Min. :-0.64
## 1st Qu.:0.02 1st Qu.:-0.64
## Median :0.02 Median :-0.64
## Mean :0.02 Mean :-0.64
## 3rd Qu.:0.02 3rd Qu.:-0.64
## Max. :0.02 Max. :-0.64
pairs.panels(df[,12:19])
Split by owner gender.
df %>%
dplyr::count(owner_gender, is_satisfied) %>%
tidyr::spread(is_satisfied, n) %>%
janitor::adorn_totals("row") %>%
janitor::adorn_totals("col")
## owner_gender FALSE TRUE Total
## female 239 1194 1433
## male 16 88 104
## Total 255 1282 1537
df %>%
dplyr::group_by(owner_gender) %>%
dplyr::mutate(freq = sum(is_satisfied) / n()) %>%
ggplot(aes(x=reorder(owner_gender, desc(freq)), fill=is_satisfied)) +
geom_bar(position="fill") +
xlab("owner gender") +
ylab("frequency")
Split by owner age.
means <- df %>%
dplyr::group_by(is_satisfied) %>%
dplyr::summarise(means = mean(owner_age))
df %>%
ggplot(aes(owner_age, fill = is_satisfied)) +
geom_histogram(alpha=0.7, position="identity", aes(y = ..density..),
color="black", bins=30) +
geom_density(alpha=0.7) +
geom_vline(xintercept=means$means, color=c("#F8766D", "#00BFC4"),
linetype="dashed", size=0.75)
Split by acquisition source.
df %>%
dplyr::count(acquisition_source, is_satisfied) %>%
tidyr::spread(is_satisfied, n) %>%
janitor::adorn_totals("row") %>%
janitor::adorn_totals("col")
## acquisition_source FALSE TRUE Total
## foreign 2 10 12
## breeder 56 424 480
## family_or_friend 27 103 130
## found 7 44 51
## pet_shop 11 57 68
## rescue 152 644 796
## Total 255 1282 1537
df %>%
dplyr::group_by(acquisition_source) %>%
dplyr::mutate(freq = sum(is_satisfied) / n()) %>%
dplyr::mutate(
acquisition_source = fct_recode(
acquisition_source,
"family/friend"="family_or_friend",
"pet shop"="pet_shop",
"shelter/rescue"="rescue"
)
) %>%
ggplot(aes(x=reorder(acquisition_source, desc(freq)), fill=is_satisfied)) +
geom_bar(position="fill") +
xlab("acquisition source") +
ylab("frequency") +
theme(axis.text.x = element_text(angle = 45, hjust=1))
Split by living situation.
df %>%
dplyr::count(is_living_with_dog, is_satisfied) %>%
tidyr::spread(is_satisfied, n) %>%
janitor::adorn_totals("row") %>%
janitor::adorn_totals("col")
## is_living_with_dog FALSE TRUE Total
## FALSE 32 158 190
## TRUE 223 1124 1347
## Total 255 1282 1537
df %>%
dplyr::group_by(is_living_with_dog) %>%
dplyr::mutate(freq = sum(is_satisfied) / n()) %>%
ggplot(aes(x=reorder(is_living_with_dog, desc(freq)), fill=is_satisfied)) +
geom_bar(position="fill") +
xlab("living with dog") +
ylab("frequency")
Split by time spent thinking.
df %>%
dplyr::count(time_thinking, is_satisfied) %>%
tidyr::spread(is_satisfied, n) %>%
janitor::adorn_totals("row") %>%
janitor::adorn_totals("col")
## time_thinking FALSE TRUE Total
## lt_one_wk 28 168 196
## one_wk_to_six_mo 127 625 752
## gt_six_mo_to_six_yr 84 444 528
## gt_six_yr 16 45 61
## Total 255 1282 1537
df %>%
dplyr::group_by(time_thinking) %>%
dplyr::mutate(freq = sum(is_satisfied) / n()) %>%
ggplot(aes(x=reorder(time_thinking, desc(freq)), fill=is_satisfied)) +
geom_bar(position="fill") +
xlab("time spent thinking") +
ylab("frequency")
Split by primary motivation.
df %>%
dplyr::count(primary_motivation, is_satisfied) %>%
tidyr::spread(is_satisfied, n) %>%
janitor::adorn_totals("row") %>%
janitor::adorn_totals("col")
## primary_motivation FALSE TRUE Total
## for_other_pet 43 181 224
## companionship 102 744 846
## exercise 19 58 77
## protection 4 8 12
## social 4 17 21
## family 21 49 70
## working 18 82 100
## other 44 143 187
## Total 255 1282 1537
df %>%
dplyr::group_by(primary_motivation) %>%
dplyr::mutate(freq = sum(is_satisfied) / n()) %>%
ggplot(aes(x=reorder(primary_motivation, desc(freq)), fill=is_satisfied)) +
geom_bar(position="fill") +
xlab("primary motivation") +
ylab("frequency") +
theme(axis.text.x = element_text(angle = 45, hjust=1))
Split by consideration of another dog.
df %>%
dplyr::filter(!is.na(is_consider_another_dog)) %>%
dplyr::count(is_consider_another_dog, is_satisfied) %>%
tidyr::spread(is_satisfied, n) %>%
janitor::adorn_totals("row") %>%
janitor::adorn_totals("col")
## is_consider_another_dog FALSE TRUE Total
## FALSE 15 46 61
## TRUE 156 720 876
## Total 171 766 937
df %>%
dplyr::filter(!is.na(is_consider_another_dog)) %>%
dplyr::group_by(is_consider_another_dog) %>%
dplyr::mutate(freq = sum(is_satisfied) / n()) %>%
ggplot(aes(x=reorder(is_consider_another_dog, desc(freq)), fill=is_satisfied)) +
geom_bar(position="fill") +
xlab("would consider an additional dog") +
ylab("frequency") +
theme(axis.text.x = element_text(angle = 45, hjust=1))
Split by ranks.
featurePlot(
x = df[, 13:19],
y = factor(df$is_satisfied),
plot = "box",
scales = list(y = list(relation="free"), x = list(rot = 90)),
layout = c(4,2),
auto.key = list(columns = 2)
)
Split by revised ranks.
featurePlot(
x = df[, 20:26],
y = factor(df$is_satisfied),
plot = "box",
scales = list(y = list(relation="free"), x = list(rot = 90)),
layout = c(4,2),
auto.key = list(columns = 2)
)
Split by source.
featurePlot(
x = df[, 13:19],
y = factor(df$acquisition_source),
plot = "box",
scales = list(y = list(relation="free"), x = list(rot = 90)),
layout = c(4,2),
auto.key = list(columns = 2)
)
table(df$primary_motivation, df$time_thinking)
##
## lt_one_wk one_wk_to_six_mo gt_six_mo_to_six_yr gt_six_yr
## for_other_pet 22 145 54 3
## companionship 74 401 326 45
## exercise 2 37 33 5
## protection 0 7 5 0
## social 2 9 10 0
## family 9 38 22 1
## working 5 47 47 1
## other 82 68 31 6
table(df$acquisition_source, df$time_thinking)
##
## lt_one_wk one_wk_to_six_mo gt_six_mo_to_six_yr gt_six_yr
## foreign 2 6 4 0
## breeder 23 192 234 31
## family_or_friend 35 63 25 7
## found 24 17 9 1
## pet_shop 7 38 20 3
## rescue 105 436 236 19
Plot of background variables to be used in model.
#mosaic(~ owner_gender + primary_motivation + is_satisfied, data=df,
# highlighting = "is_satisfied", highlighting_fill = c("lightblue", "pink"),
# direction = c("h", "v", "v"))
df_long <- df %>%
dplyr::select(owner_gender, is_satisfied, dplyr::contains("rank"), time_thinking) %>%
rename_with(~ gsub("revised_(.*)", "\\1_revised", .)) %>%
tidyr::pivot_longer(
cols=age_rank:trainability_rank_revised,
names_to=c("char", "revised"),
names_pattern="(.*)_rank_?(revised)?",
values_to="rank",
values_drop_na=TRUE
) %>%
dplyr::mutate(
Adoption=factor(ifelse(revised =="revised", "future", "past"),
levels=c("past", "future"))) %>%
dplyr::mutate(char=as.factor(char))
width <- 5
df_long %>%
ggplot(aes(x=char, y=rank, fill=Adoption)) +
introdataviz::geom_split_violin(alpha=.5, trim=T) +
geom_boxplot(width=.125, alpha=.8, show.legend=FALSE) +
scale_x_discrete(name = "Characteristic") +
scale_y_continuous(name = "Rank",
limits = c(1, 7),
breaks = seq(1, 7, 1))
ggsave("fig-1-violin-plot.pdf")
## Saving 7 x 5 in image
df_long %>%
dplyr::filter(Adoption=="past") %>%
ggplot(aes(x=rank, y=char, fill = 0.5 - abs(0.5 - stat(ecdf)))) +
stat_density_ridges(geom = "density_ridges_gradient", calc_ecdf = TRUE) +
scale_fill_viridis_c(name = "Tail probability", direction = -1)
## Picking joint bandwidth of 0.386
df_long %>%
dplyr::filter(Adoption=="future") %>%
ggplot(aes(x=rank, y=char, fill = 0.5 - abs(0.5 - stat(ecdf)))) +
stat_density_ridges(geom = "density_ridges_gradient", calc_ecdf = TRUE) +
scale_fill_viridis_c(name = "Tail probability", direction = -1)
## Picking joint bandwidth of 0.398
df_long %>%
ggplot(aes(x=rank, y=char, fill=Adoption,
group=interaction(char, !ifelse(Adoption=="past", F, T)))) +
ggridges::geom_density_ridges(alpha=0.5) +
theme_ridges() +
labs(
x = "Rank",
y = "Characteristic",
)
## Picking joint bandwidth of 0.392
ggsave("fig-1-ridgeline-plot.pdf")
## Saving 7 x 5 in image
## Picking joint bandwidth of 0.392
df_long2 <- df_long %>%
mutate(char2 = 2*as.numeric(as.factor(char))+0.8*as.numeric(as.factor(Adoption)))
df_long2 %>%
ggplot(aes(x=char2, y=rank)) +
geom_jitter(aes(color=Adoption), width=0.3, alpha=0.3) +
geom_boxplot(aes(x=char2, y=rank,
group=interaction(char, Adoption)),
width=0.35, alpha=0.1) +
scale_x_continuous(breaks=unique(df_long2$char2[1:7])+.4,
labels=unique(df_long2$char)) +
labs(
y = "Rank",
x = "Characteristic",
) +
guides(colour = guide_legend(override.aes = list(alpha = 1)))
ggsave("fig-1-jitter-plot.pdf")
## Saving 7 x 5 in image
df_long2 <- df_long %>%
mutate(char2 = 2*as.numeric(as.factor(char))+0.8*as.numeric(as.factor(Adoption)))
df_long2 %>%
ggplot(aes(x=char2, y=rank)) +
#geom_jitter(aes(color=Adoption), width=0.3, alpha=0.3) +
geom_boxplot(aes(x=char2, y=rank,
group=interaction(char, Adoption),
color=Adoption),
width=0.35, alpha=0.1) +
scale_x_continuous(breaks=unique(df_long2$char2[1:7])+.4,
labels=unique(df_long2$char)) +
labs(
y = "Rank",
x = "Characteristic",
) +
guides(colour = guide_legend(override.aes = list(alpha = 1)))
ggsave("fig-1-box-plot.pdf")
## Saving 7 x 5 in image
df_long2 %>%
ggplot(aes(x=char, y=rank, fill=Adoption)) +
geom_boxplot() +
labs(
y = "Rank",
x = "Characteristic",
)
ggsave("fig-1-box-plot.pdf")
## Saving 7 x 5 in image
df_long_totals <- df_long %>%
dplyr::select(owner_gender, is_satisfied, time_thinking) %>%
plyr::ddply(.(owner_gender, time_thinking), summarize, is_satisfied=sum(is_satisfied))
titanic.colors<-list("gray90",c("#0000ff","#7700ee","#aa00cc","#dd00aa"),
c("#ddcc00","#ee9900"),c("pink","lightblue"))
barNest(
is_satisfied~owner_gender+time_thinking,
data=df_long_totals,
col=titanic.colors,
#showall=TRUE,
main="Owner satisfaction by owner gender, age, and forethought",
ylab="Proportion satisfied"
#FUN=c("propbrk")#,"binciWu","binciWl","valid.n"),
#shrink=0.15
#trueval="Yes"
)
df_pca <- df %>%
dplyr::select(contains("rank") & !contains("revised"))
pc <- prcomp(df_pca, center = TRUE, scale = TRUE)
attributes(pc)
## $names
## [1] "sdev" "rotation" "center" "scale" "x"
##
## $class
## [1] "prcomp"
print(pc)
## Standard deviations (1, .., p=7):
## [1] 1.3320830 1.1654028 1.0729902 1.0003757 0.9554394 0.8414860 0.3071946
##
## Rotation (n x k) = (7 x 7):
## PC1 PC2 PC3 PC4 PC5
## age_rank 0.2763080 -0.18350266 0.50546070 -0.633087619 0.30842543
## appearance_rank 0.4510782 -0.07276318 -0.37139463 0.436724495 0.46081320
## breed_rank 0.2979285 0.69415885 0.07858535 -0.005516831 -0.18557153
## compatability_rank -0.4407692 -0.16162426 0.54026406 0.506224362 -0.06710822
## personality_rank -0.4440301 -0.16790994 -0.34920570 -0.216328331 0.47665141
## size_rank 0.2257576 -0.56457471 -0.26666500 -0.113386110 -0.62307390
## trainability_rank -0.4347577 0.32575304 -0.33979371 -0.304177390 -0.19536375
## PC6 PC7
## age_rank 0.19622592 -0.3163216
## appearance_rank 0.32449813 -0.3806887
## breed_rank -0.42870529 -0.4527151
## compatability_rank 0.05345027 -0.4733828
## personality_rank -0.54379407 -0.2880839
## size_rank -0.14332111 -0.3708912
## trainability_rank 0.59438899 -0.3246872
stats::biplot(pc, scale = 0)
# Explained variance.
round(pc$sdev^2 / sum(pc$sdev^2), 2)
## [1] 0.25 0.19 0.16 0.14 0.13 0.10 0.01
g <- ggbiplot(pc,
obs.scale = 1,
var.scale = 1,
groups = df$is_satisfied,
ellipse = TRUE,
circle = TRUE,
ellipse.prob = 0.68)
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal',
legend.position = 'top')
print(g)
pc <- PCAtools::pca(df_pca)
PCAtools::screeplot(pc)
PCAtools::biplot(pc, showLoadings = T)
PCAtools::plotloadings(pc, labSize = 3)
## -- variables retained:
## 27, 81, 87, 150, 153, 169, 173, 194, 207, 249, 253, 317, 357, 368, 395, 419, 435, 454, 518, 529, 559, 570, 598, 609, 675, 760, 787, 851, 894, 1019, 1028, 1034, 1085, 1116, 1173, 1227, 1230, 1233, 1276, 1304, 1315, 1410, 1523, 1525, 23, 69, 78, 304, 306, 307, 308, 309, 310, 316, 319, 471, 552, 934, 974, 975, 981, 982, 1006, 1015, 1107, 1110, 1158, 1180, 1290, 1357, 1419, 165, 198, 205, 483, 535, 560, 705, 824, 843, 864, 1178, 1255, 1257, 1515, 25, 1372, 555, 850, 21, 720, 1104, 1404, 74, 556, 733, 859, 1204, 84, 256, 779, 1065, 1105, 1141, 1226, 1261, 1306, 1334, 144, 283, 572, 677, 699, 754, 805, 1441, 248, 619, 993
## Warning: ggrepel: 579 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
pairsplot(
pc,
triangle = TRUE,
trianglelabSize = 12,
hline = 0,
vline = 0,
pointSize = 1
)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
df_rank <- df[,c(13, 18:19)]
fa(r=cor(df_rank), nfactors=dim(df_rank)[2], rotate="varimax", SMC=FALSE, fm="pa")
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Factor Analysis using method = pa
## Call: fa(r = cor(df_rank), nfactors = dim(df_rank)[2], rotate = "varimax",
## SMC = FALSE, fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA2 PA1 PA3 h2 u2 com
## owner_age 1.00 0.04 0.04 1 3.3e-16 1
## personality_rank 0.04 1.00 -0.06 1 5.6e-16 1
## size_rank 0.04 -0.06 1.00 1 3.3e-16 1
##
## PA2 PA1 PA3
## SS loadings 1.00 1.00 1.00
## Proportion Var 0.33 0.33 0.33
## Cumulative Var 0.33 0.67 1.00
## Proportion Explained 0.33 0.33 0.33
## Cumulative Proportion 0.33 0.67 1.00
##
## Mean item complexity = 1
## Test of the hypothesis that 3 factors are sufficient.
##
## The degrees of freedom for the null model are 3 and the objective function was 0.03
## The degrees of freedom for the model are -3 and the objective function was 0
##
## The root mean square of the residuals (RMSR) is 0
## The df corrected root mean square of the residuals is NA
##
## Fit based upon off diagonal values = 1
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: aarch64-apple-darwin21.3.0 (64-bit)
## Running under: macOS Monterey 12.4
##
## Matrix products: default
## BLAS: /opt/homebrew/Cellar/openblas/0.3.20/lib/libopenblasp-r0.3.20.dylib
## LAPACK: /opt/homebrew/Cellar/r/4.2.0/lib/R/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] introdataviz_0.0.0.9003 ggmosaic_0.3.4 PCAtools_2.5.15
## [4] ggrepel_0.9.1 ggbiplot_0.55 scales_1.2.0
## [7] plyr_1.8.7 devtools_2.4.3 usethis_2.1.6
## [10] plotrix_3.8-2 ggridges_0.5.3 vcd_1.4-10
## [13] caret_6.0-92 lattice_0.20-45 psych_2.2.5
## [16] janitor_2.1.0 forcats_0.5.1 stringr_1.4.0
## [19] dplyr_1.0.9 purrr_0.3.4 readr_2.1.2
## [22] tidyr_1.2.0 tibble_3.1.7 ggplot2_3.3.6
## [25] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.0 backports_1.4.1
## [3] lazyeval_0.2.2 splines_4.2.0
## [5] BiocParallel_1.30.3 listenv_0.8.0
## [7] digest_0.6.29 foreach_1.5.2
## [9] htmltools_0.5.2 fansi_1.0.3
## [11] magrittr_2.0.3 memoise_2.0.1
## [13] ScaledMatrix_1.4.0 tzdb_0.3.0
## [15] remotes_2.4.2 recipes_0.2.0
## [17] globals_0.15.0 modelr_0.1.8
## [19] gower_1.0.0 matrixStats_0.62.0
## [21] hardhat_1.1.0 prettyunits_1.1.1
## [23] colorspace_2.0-3 rvest_1.0.2
## [25] haven_2.5.0 xfun_0.30
## [27] callr_3.7.0 crayon_1.5.1
## [29] jsonlite_1.8.0 survival_3.3-1
## [31] zoo_1.8-10 iterators_1.0.14
## [33] glue_1.6.2 gtable_0.3.0
## [35] ipred_0.9-12 DelayedArray_0.22.0
## [37] pkgbuild_1.3.1 BiocSingular_1.12.0
## [39] future.apply_1.9.0 BiocGenerics_0.42.0
## [41] DBI_1.1.3 Rcpp_1.0.8.3
## [43] viridisLite_0.4.0 dqrng_0.3.0
## [45] rsvd_1.0.5 stats4_4.2.0
## [47] lava_1.6.10 prodlim_2019.11.13
## [49] htmlwidgets_1.5.4 httr_1.4.3
## [51] ellipsis_0.3.2 farver_2.1.1
## [53] pkgconfig_2.0.3 nnet_7.3-17
## [55] sass_0.4.1 dbplyr_2.2.0
## [57] utf8_1.2.2 labeling_0.4.2
## [59] tidyselect_1.1.2 rlang_1.0.3
## [61] reshape2_1.4.4 munsell_0.5.0
## [63] cellranger_1.1.0 tools_4.2.0
## [65] cachem_1.0.6 cli_3.3.0
## [67] generics_0.1.2 broom_0.8.0
## [69] evaluate_0.15 fastmap_1.1.0
## [71] yaml_2.3.5 ModelMetrics_1.2.2.2
## [73] processx_3.5.3 knitr_1.39
## [75] fs_1.5.2 sparseMatrixStats_1.8.0
## [77] future_1.25.0 nlme_3.1-157
## [79] xml2_1.3.3 brio_1.1.3
## [81] compiler_4.2.0 rstudioapi_0.13
## [83] plotly_4.10.0 curl_4.3.2
## [85] testthat_3.1.4 reprex_2.0.1
## [87] bslib_0.3.1 stringi_1.7.6
## [89] highr_0.9 ps_1.7.0
## [91] desc_1.4.1 Matrix_1.4-1
## [93] vctrs_0.4.1 pillar_1.7.0
## [95] lifecycle_1.0.1 lmtest_0.9-40
## [97] jquerylib_0.1.4 irlba_2.3.5
## [99] data.table_1.14.2 cowplot_1.1.1
## [101] R6_2.5.1 IRanges_2.30.0
## [103] parallelly_1.31.1 sessioninfo_1.2.2
## [105] codetools_0.2-18 MASS_7.3-57
## [107] assertthat_0.2.1 pkgload_1.2.4
## [109] rprojroot_2.0.3 withr_2.5.0
## [111] mnormt_2.1.0 S4Vectors_0.34.0
## [113] parallel_4.2.0 hms_1.1.1
## [115] beachmat_2.12.0 rpart_4.1.16
## [117] timeDate_3043.102 class_7.3-20
## [119] DelayedMatrixStats_1.18.0 rmarkdown_2.14
## [121] snakecase_0.11.0 MatrixGenerics_1.8.1
## [123] pROC_1.18.0 lubridate_1.8.0
rm(list = ls())